import os
import pickle
import re
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import gensim.corpora as corpora
from gensim.matutils import Sparse2Corpus
from gensim.test.utils import datapath

warnings.filterwarnings("ignore", category=FutureWarning)
# Load selected model
models_dir = datapath("train_models\\")
selected_model = "nb5_na04_a1_b1_models.pkl"
with open(os.path.join(models_dir, selected_model), "rb") as handle:
model = pickle.load(handle)
a1_b1_k20 = model["a1_b1_k20"]
# Load dictionary
dictionary = corpora.Dictionary.load(datapath("vocabulary\\{}".format("nb5_na04")))
# load trained bigram
filename = datapath('train_bigram\\{}_bigram.pkl'.format("nb5_na04"))
with open(filename, "rb") as f:
train_bigram = pickle.load(f)
# Load train data
df_train = pd.read_csv('.\\Datasource_backup\\sub_onetree_train.csv')
X_train = train_bigram.transform(df_train['clean_text'].tolist())
corpus = Sparse2Corpus(X_train, documents_columns=False)
top_words = a1_b1_k20.show_topics(num_topics=20, num_words=10, formatted = False)
top_words
Output: probability mass function over the words in the model for each topic
Scatter plot: distance between topics in the scatter plot is an approximation of the difference between topic distribution (approximation of the semantic relationship)
Bubble size: is the topic prevalence
Indices inside the bubbles: indicate the topic prevalence rank — bubble number 1 is the most prevalent topic, and the highest number is the least prevalent.
Bar: list top 30 words given the topic
Red bars: frequency of each word given a topic
Gray bars: overall word frequency
How do other topics use the word?: Visualize the unexplained portion of a word within a topic by simply hovering the mouse over the word.
Distance between circles: represents topic similarity (approx to the original topic similarity matrix, since we are using a two dimensional scatter plot). t-distributed Stochastic Neighbor Embedding and/or Multidimensional Scaling do their best to preserve the original distance.
Interactively tune relevance metric (parameter lambda) to introduce new words that are specific to the topic.
Decreasing lambda puts more weight on the ratio of red to gray (the word's frequency within the topic relative to its overall frequency). This can improve readability for those who are not familiar with the topic.
# Visualize the topics using Jensen-Shannon Divergence & t-distributed Stochastic Neighbor Em
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(a1_b1_k20, corpus, dictionary, mds='TSNE')
vis
# Visualize the topics using ensen-Shannon Divergence & Principal Coordinate Analysis(aka Classical Multidimensional Scaling)
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(a1_b1_k20, corpus, dictionary)
vis
| Index pyLDAvis | Inferred topic | LDA topic number |
|---|---|---|
| 1 | Broad topic regarding Automation - Devices - Network | 19 |
| 2 | Smart lights | 18 |
| 3 | Smart thermostat | 9 |
| 4 | Home entertainment - voice assistant | 7 |
| 5 | Audio - Speakers | 4 |
| 6 | Smart Lock systems | 16 |
| 8 | Smart camera - surveillance | 15 |
| 9 | Smart plugs - power systems | 11 |
| 10 | Smart door systems | 5 |
# Find the topic number with the highest
def dominant_topic(ldamodel, corpus, document):
# init dataframe
topics_df = pd.DataFrame()
# GET MAIN TOPIC IN EACH DOCUMENT
# Get throught the pages
for num, doc in enumerate(ldamodel[corpus]):
# Count number of list into a list
if sum(isinstance(i, list) for i in doc)>0:
doc = doc[0]
doc = sorted(doc, key= lambda x: (x[1]), reverse=True)
for j, (topic_num, prop_topic) in enumerate(doc):
if j == 0: # => dominant topic
# Get list prob. * keywords from the topic
pk = ldamodel.show_topic(topic_num)
topic_keywords = ', '.join([word for word, prop in pk])
# Add topic number, probability, keywords and original text to the dataframe
topics_df = topics_df.append(pd.Series([int(topic_num), np.round(prop_topic, 4),
topic_keywords, document[num]]),
ignore_index=True)
else:
break
# Add columns name
topics_df.columns = ['Dominant_Topic', '%_Contribution', 'Topic_Keywords', 'Text']
return topics_df
df_dominant_topic = dominant_topic(a1_b1_k20, corpus, df_train['text'])
df_dominant_topic.head(10)
Original submission and tree text
print(df_train["text"][0])
Assigned topics ordered by their contribution
sorted(a1_b1_k20[corpus][0][0], key= lambda x: (x[1]), reverse=True)
df_topic_sorted = pd.DataFrame()
df_topic_grouped = df_dominant_topic.groupby('Dominant_Topic')
for i, grp in df_topic_grouped:
# populate the sorted dataframe with the document that contributed the most to the topic
df_topic_sorted = pd.concat([df_topic_sorted, grp.sort_values(['%_Contribution'], ascending = [0]).head(1)], axis = 0)
# Reset Index and change columns name
df_topic_sorted.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
df_topic_sorted.loc[df_topic_sorted["Topic_Num"].isin([19, 18, 9, 7, 4, 16, 15, 11, 5])]
Most representative text for topic 19 (Broad topic regarding Automation - Devices - Network):
print(df_train["text"][3327])
Most representative text for topic 4 (Audio - Speakers):
print(df_train["text"][775])
Verify assigned topics ordered by their contribution for the following text
print(df_train["text"][5860])
sorted(a1_b1_k20[corpus][5860][0], key= lambda x: (x[1]), reverse=True)
a1_b1_k20.get_term_topics('security', minimum_probability=0.001)
a1_b1_k20.get_term_topics('privacy', minimum_probability=0.0001)
a1_b1_k20.get_term_topics('trust', minimum_probability=0.0001)
a1_b1_k20.get_term_topics('connectivity', minimum_probability=0.0001)
a1_b1_k20.get_term_topics('personal', minimum_probability=0.0001)
# To check how widely a topic was discussed
# Number of documents for each topic
topic_counts = df_dominant_topic['Dominant_Topic'].value_counts()
# Percentage of Documents for each Topic
topic_contribution = np.round(topic_counts/topic_counts.sum(), 4)
# Topic Number and Keywords
topic_num_keywords = df_topic_sorted[['Topic_Num', 'Keywords']].set_index(df_topic_sorted['Topic_Num'])
df_dominant_topics = pd.concat([topic_num_keywords, topic_counts, topic_contribution], axis = 1)
df_dominant_topics.reset_index(drop = True, inplace = True)
df_dominant_topics.columns = ['Topic_Num', 'Topic_Keywords', 'Num_Documents', 'Perc_Documents']
df_dominant_topics
# with the size of the words proportional to the weight
from wordcloud import WordCloud
cloud = WordCloud(background_color='white',
width=2500,
height=1800,
max_words=50,
color_func=lambda *args, **kwargs: list(list(mcolors.TABLEAU_COLORS.values())*2)[i],
prefer_horizontal=1.0)
topics = a1_b1_k20.show_topics(num_topics=20, num_words=50, formatted=False)
topics = [(x, dist) for (x, dist) in topics if x in [19, 18, 9, 7, 4, 16, 15, 11, 5]]
topic_num = [x for (x, dist) in topics]
fig, axes = plt.subplots(3, 3, figsize=(70,100), sharex=True, sharey=True)
for i, ax in enumerate(axes.flatten()):
fig.add_subplot(ax)
topic_words = dict(topics[i][1])
cloud.generate_from_frequencies(topic_words, max_font_size=300)
plt.gca().imshow(cloud)
plt.gca().set_title('Topic ' + str(topic_num[i]), fontdict=dict(size=200))
plt.gca().axis('off')
plt.subplots_adjust(wspace=0, hspace=0.01)
plt.axis('off')
plt.margins(x=0, y=0)
plt.tight_layout()
plt.show()
# Keep an eye on common words that occur in multiple topics and the one
# whose relative frequency is more than the weight. >> those should be added to stop_words
topics = a1_b1_k20.show_topics(num_topics = 20, num_words = 20, formatted=False)
topics = [(x, dist) for (x, dist) in topics if x in [19, 18, 9, 7, 4, 16, 15, 11, 5]]
topic_num = [x for (x, dist) in topics]
def corpus2token_text(corpus, dictionary):
nested_doc = []
texts = []
for doc in corpus:
nested_doc.append([[dictionary[k]]*v for (k, v) in doc])
for doc in nested_doc:
texts.append([item for sublist in doc for item in sublist])
return texts
texts = corpus2token_text(corpus, dictionary)
data_flat = [word for doc in texts for word in doc]
# words stored as dict keys and their count as dict values
counter = Counter(data_flat)
out = []
for num, dist in topics:
# relative weight to the topic
for word, weight in dist:
out.append([word, num, weight, counter[word]])
df = pd.DataFrame(out, columns=['word', 'topic_id', 'weight', 'word_count'])
# Plot Word Count and Weights of Topic Keywords
fig, axes = plt.subplots(3, 3, figsize=(150,100), sharey=True, dpi=160)
cols = list(list(mcolors.TABLEAU_COLORS.values())*2)
for i, ax in enumerate(axes.flatten()):
ax.bar(x='word', height="word_count", data=df.loc[df.topic_id==topic_num[i], :], color=cols[i], width=0.5, alpha=0.3, label='Word Count')
ax_twin = ax.twinx()
ax_twin.bar(x='word', height="weight", data=df.loc[df.topic_id==topic_num[i], :], color=cols[i], width=0.2, label='Weights')
ax.set_ylabel('Word Count', color=cols[i], fontsize=70)
ax_twin.set_ylim(0.0001, 0.0500); ax.set_ylim(0, 10000)
ax.set_title('Topic: ' + str(topic_num[i]), color=cols[i], fontsize=100)
ax.tick_params(axis='y', left=False, labelsize=70)
ax_twin.tick_params(axis='y', labelsize=70)
ax.set_xticklabels(df.loc[df.topic_id==topic_num[i], 'word'], rotation=30, horizontalalignment= 'right', fontsize=70)
ax.legend(loc='upper left', fontsize=70); ax_twin.legend(loc='upper right', fontsize=70)
fig.tight_layout(w_pad=2)
fig.suptitle('Word Count and weight of Topic Keywords', fontsize=200, y=1.05)
plt.show()